#Libraries imported and utilized
import pandas as pd
import numpy as np
import yfinance as yf
from termcolor import colored
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
import datetime
import pandas_ta as ta
from sklearn.metrics import mean_squared_error
import warnings
warnings.filterwarnings('ignore')
Stock_Data = yf.download('TATAMOTORS.NS', start='2010-01-04', end='2023-08-05')
Stock_Data.to_csv('TATAMOTORS.csv')
[*********************100%%**********************] 1 of 1 completed
#Importing the CSV file using pandas
stock_ = pd.read_csv('TATAMOTORS.csv')
print(colored(f"{' '*57}\n The downloaded stock data has {stock_.shape[0]} \
rows and {stock_.shape[1]} columns \n{' '*57}", 'green', attrs=['bold','reverse']))
The downloaded stock data has 3363 rows and 7 columns
#Checking the data types at column index
print(colored(f"\n Data types of dataset parameters. ", 'green', attrs=['bold','reverse']))
for i,j in enumerate(stock_.columns):
print(f'{i} - {j} - {stock_[j].dtypes}')
Data types of dataset parameters.
0 - Date - object
1 - Open - float64
2 - High - float64
3 - Low - float64
4 - Close - float64
5 - Adj Close - float64
6 - Volume - float64
#Changing the Date column data-type to datetime
stock_['Date'] = pd.to_datetime(stock_['Date'])
print(colored(f"\nThe Date column has been converted to datetime from object type", 'blue', attrs=['bold','reverse']))
#Data distribution on Date
print(colored(f"\nAn overview of the distribution of data based on date", 'green', attrs=['bold','reverse']))
print(f"{'-'*55}\n{stock_.Date.describe()}\n{'-'*55}")
print(colored(f"Hence the stock recording are\n\tfrom {stock_.Date.min().date()} to {stock_.Date.max().date()}", 'black', attrs=['bold']))
print(f"{'-'*22}{'^'*11}{'-'*22}")
The Date column has been converted to datetime from object type An overview of the distribution of data based on date ------------------------------------------------------- count 3363 mean 2016-10-16 07:58:42.925959168 min 2010-01-04 00:00:00 25% 2013-05-20 12:00:00 50% 2016-10-19 00:00:00 75% 2020-03-17 12:00:00 max 2023-08-04 00:00:00 Name: Date, dtype: object ------------------------------------------------------- Hence the stock recording are from 2010-01-04 to 2023-08-04 ----------------------^^^^^^^^^^^----------------------
fig = px.line(y = stock_['Adj Close'], x = stock_['Date'], title='TataMotors')
fig.update_layout(template='plotly_dark')
fig.update_traces(line=dict(color='#717D7E'))
fig.update_xaxes(title='Date')
fig.update_yaxes(title='Adjusted Close')
fig.show(renderer='notebook')
# Selecting stocks after 2013 for Analysis and future predictions
start_date = pd.to_datetime('2013-01-01')
df = stock_[stock_['Date']>=start_date]
#print(f'The data has {df.shape[0]} rows and {df.shape[1]} columns')
print(colored(f"{'-'*65}\n we would be using data only after {start_date.year}.{' '*25}\n\t\
Therefore the new data is from {df.Date.min().date()} to {df.Date.max().date()} \n{'-'*65}", 'magenta', attrs=['bold','reverse']))
print(colored(f"length of data is now {len(df)}", 'black', attrs=['bold','underline']))
----------------------------------------------------------------- we would be using data only after 2013. Therefore the new data is from 2013-01-01 to 2023-08-04 ----------------------------------------------------------------- length of data is now 2617
#Setting Date to be the index of the dataframe to establish a sequence of time-based data.
#The LSTM model uses past time steps to predict future values
df.set_index('Date', inplace = True)
df.sort_index(inplace=True)
print(colored(f"\n{' '*6}Yearly Trading records{' '*6}", 'green', attrs=['bold','reverse']))
y = 2012
print(colored(f"\nYEAR{'-'*5}TRADING DAYS{'-'*2}NO TRADING", 'green', attrs=['bold']))
while y<2024:
l = len(df[f'{y}-01-01':f'{y}-12-31'])
print(f"{y}--|||--{l} dys--|||--{261-l} dys")#261 Workingdays in a year
y+=1
Yearly Trading records YEAR-----TRADING DAYS--NO TRADING 2012--|||--0 dys--|||--261 dys 2013--|||--248 dys--|||--13 dys 2014--|||--245 dys--|||--16 dys 2015--|||--247 dys--|||--14 dys 2016--|||--246 dys--|||--15 dys 2017--|||--248 dys--|||--13 dys 2018--|||--246 dys--|||--15 dys 2019--|||--243 dys--|||--18 dys 2020--|||--251 dys--|||--10 dys 2021--|||--248 dys--|||--13 dys 2022--|||--248 dys--|||--13 dys 2023--|||--147 dys--|||--114 dys
#Checking for null values
print(colored(f"\n Null values per column are as follows:-", 'green', attrs=['bold','reverse']))
df.isna().sum()
Null values per column are as follows:-
Open 2 High 2 Low 2 Close 2 Adj Close 2 Volume 2 dtype: int64
# Checking all null value rows
print(colored(f"\n{' '*4}Checking all the entries with null value{' '*4}", 'green', attrs=['bold', 'reverse']))
display(df[df.isna().any(axis=1)])
print(colored(f'\nThe length of null-data is {len(df[df.isna().any(axis=1)])} entries', 'black', attrs=['bold','underline']))
print(f"\n{'-'*22}{'^'*11}{'-'*22}")
Checking all the entries with null value
| Open | High | Low | Close | Adj Close | Volume | |
|---|---|---|---|---|---|---|
| Date | ||||||
| 2014-03-22 | NaN | NaN | NaN | NaN | NaN | NaN |
| 2015-02-28 | NaN | NaN | NaN | NaN | NaN | NaN |
The length of null-data is 2 entries
----------------------^^^^^^^^^^^----------------------
#Dropping the null values and re-checking the data
df.dropna(axis =0, inplace=True)
#Generating statistical summary of the data
print(colored(f"\n{' '*20}Statistical summary of the data{' '*20}", 'green', attrs=['bold', 'reverse']))
display(df.describe())
#A general peek at the dataframe
print(colored(f"\n{' '*23}A peak at the dataframe{' '*23}", 'green', attrs=['bold', 'reverse']))
df
Statistical summary of the data
| Open | High | Low | Close | Adj Close | Volume | |
|---|---|---|---|---|---|---|
| count | 2615.000000 | 2615.000000 | 2615.000000 | 2615.000000 | 2615.000000 | 2.615000e+03 |
| mean | 354.842676 | 359.743554 | 349.271249 | 354.244936 | 352.775674 | 2.172449e+07 |
| std | 131.910147 | 132.952652 | 130.580421 | 131.722710 | 131.351571 | 2.957094e+07 |
| min | 66.500000 | 66.900002 | 63.500000 | 65.300003 | 65.097115 | 0.000000e+00 |
| 25% | 276.000000 | 279.824707 | 268.682816 | 275.575012 | 273.923676 | 6.395300e+06 |
| 50% | 381.649994 | 387.000000 | 376.462067 | 381.450012 | 379.218079 | 1.049052e+07 |
| 75% | 452.024994 | 458.324997 | 446.174988 | 451.574997 | 450.131775 | 2.344143e+07 |
| max | 647.000000 | 665.400024 | 642.000000 | 644.299988 | 644.299988 | 3.905778e+08 |
A peak at the dataframe
| Open | High | Low | Close | Adj Close | Volume | |
|---|---|---|---|---|---|---|
| Date | ||||||
| 2013-01-01 | 312.646545 | 314.922150 | 311.162476 | 313.388580 | 308.856689 | 3980484.0 |
| 2013-01-02 | 315.120026 | 317.544037 | 312.893890 | 314.625336 | 310.075562 | 6929285.0 |
| 2013-01-03 | 316.604095 | 318.335541 | 313.784363 | 314.724274 | 310.173035 | 6165017.0 |
| 2013-01-04 | 315.120026 | 315.120026 | 309.876282 | 312.052917 | 307.540314 | 8965650.0 |
| 2013-01-07 | 314.427460 | 315.218964 | 309.282623 | 309.925751 | 305.443909 | 4943021.0 |
| ... | ... | ... | ... | ... | ... | ... |
| 2023-07-31 | 640.200012 | 645.549988 | 635.900024 | 644.299988 | 644.299988 | 7502361.0 |
| 2023-08-01 | 645.000000 | 652.900024 | 642.000000 | 643.650024 | 643.650024 | 8924219.0 |
| 2023-08-02 | 644.000000 | 644.299988 | 616.450012 | 622.650024 | 622.650024 | 15762687.0 |
| 2023-08-03 | 623.000000 | 626.450012 | 614.150024 | 618.950012 | 618.950012 | 11458920.0 |
| 2023-08-04 | 624.849976 | 625.000000 | 613.299988 | 615.000000 | 615.000000 | 11806739.0 |
2615 rows × 6 columns
# Creating rolling window to analyse Moving-Average(MA) of Closing price of stock
# Analysing MA(weekly), MA(Monthly), MA(Quarterly)
#Rolling Algorithm
print(colored(f"\n{' '*15}Plots corresponding to Moving-Average(weekly), Moving-Average(Monthly), Moving-Average(Quarterly){' '*15}",
'blue', attrs=['bold', 'reverse']))
roll = [7,30,90]
for i in roll:
cn = f'rolling {i}'
df[cn] = df['Adj Close'].rolling(i).mean()
#Plotting the Moving Averages on Seaborn plot
roll_col = ['Adj Close']
for i in df.columns:
if 'roll'in i:
roll_col.append(i)
fig, axs = plt.subplots(2,2, figsize = (15,8), dpi = 500)
color_ = ['blue','orange','green','red']
x, y = 0, 0
for j,i in enumerate(roll_col):
sns.lineplot(df[i], label = i, ax = axs[y,x], color = color_[j])
axs[y,x].set_title(i)
x +=1
if x>1:
y +=1
x=0
fig.suptitle('Moving Average of Stock', fontsize=22)
plt.tight_layout()
Plots corresponding to Moving-Average(weekly), Moving-Average(Monthly), Moving-Average(Quarterly)
#Calculating Daily returns based on percent change between the current and a prior element
#Percent change can provide insights into the volatility and distribution of the stock returns over time.
print(colored(f"\nCalculating Daily returns based on percent{' '*49}\n\t\
Percent change can provide insights into the volatility and distribution of{' '*8}\n\tthe stock returns over time{' '*56}",
'blue', attrs=['bold','reverse','blink']))
fig , ax = plt.subplots(2, figsize = (8,5), dpi=1000)
df['daily return'] = df['Adj Close'].pct_change()
sns.lineplot(df['daily return'], linestyle=":", ax = ax[1])
sns.histplot(data=df, x="daily return", kde = True, ax = ax[0])
ax[0].lines[0].set_color('b')
fig.suptitle('Daily returns')
ax = plt.gca()
plt.tight_layout()
Calculating Daily returns based on percent
Percent change can provide insights into the volatility and distribution of
the stock returns over time
#Checking for Autocorrelation
from statsmodels.graphics import tsaplots
from matplotlib import rcParams
rcParams['figure.figsize'] = 8,3
x = len(df)*0.92
for i in df[['Adj Close', 'Volume']]:
fig = tsaplots.plot_acf(df[i], lags=len(df)-1)
plt.title(f'Autocorrelation of {i}')
print(colored(f" Since all points lie within the confidence interval bands{' '*6}\n\
This could imply that there is no clear pattern or trend in the{' '*1}\n\
stock's price movements that the tsaplots could detect{' '*10}", 'magenta', attrs=['bold','reverse']))
Since all points lie within the confidence interval bands
This could imply that there is no clear pattern or trend in the
stock's price movements that the tsaplots could detect
pip install pandas_ta
Indicators like RSI, ADL, ATR, MOM, MFI, ROC, OBV, and CCI are commonly used in Technical Analysis to help traders and investors make informed decisions about buying and selling stocks. These indicators are based on past price and volume data and provide insights into market trends and the strength of those trends.
By using these indicators, traders may be able to identify potential entry and exit points for trades and make more profitable decisions.
These indicators can be used in LSTM modeling to make predictions about future price movements and return levels in the market.
cn = "Adj Close: Adjusted Close Price, Volume: Trading Volume,Daily Returns ,RSI: Relative Strength Index,ADL: Accumulation/Distribution Line,ATR: Average True Range,MOM: Momentum,MFI: Money Flow Index,ROC: Rate of Change,OBV: On-Balance Volume,CCI: Commodity Channel Index"
indicators = []
for i in cn.split(','):
indicators.append(i.strip())
df['rsi'] = ta.rsi(df['Close'])
df['adl'] = ta.ad(df['High'], df['Low'], df['Close'], df['Volume'])
df['atr'] = ta.atr(df['High'], df['Low'], df['Close'])
df['mom'] = ta.mom(df['Close'])
df['mfi'] = ta.mfi(df['High'], df['Low'], df['Close'], df['Volume'])
df['roc'] = ta.roc(df['Close'])
df['obv'] = ta.obv(df['Close'], df['Volume'])
df['cci'] = ta.cci(df['High'], df['Low'], df['Close'])
df = df.fillna(0)
print(colored(f"\n{' '*3}List of Financial Indicators employed:-{' '*2}",'red', attrs=['bold','reverse']))
indicators[3:]
List of Financial Indicators employed:-
['RSI: Relative Strength Index', 'ADL: Accumulation/Distribution Line', 'ATR: Average True Range', 'MOM: Momentum', 'MFI: Money Flow Index', 'ROC: Rate of Change', 'OBV: On-Balance Volume', 'CCI: Commodity Channel Index']
#Correlations with Adj Close
feature_, correlation_, obj_ = [], [], []
for i in df.drop('Adj Close', axis =1):
try:
x = np.round(df[i].corr(df['Adj Close']),2)
#print(f"{i}'s...corelation with Adj Close is...{x:0.4f}")
feature_.append(i)
correlation_.append(x)
except:
obj_.append(i)
plt.figure(figsize=(15,6), dpi=1000)
plt.title('Feature Correlation with Ajdusted Close',fontdict={'fontsize':18})
ax = sns.barplot(y = feature_, x = correlation_)
ax.bar_label(ax.containers[0])
ax.set_xlabel("Correlation")
Text(0.5, 0, 'Correlation')
col = ['Adj Close','Volume']+df.columns[9:].tolist()
x = 0
clr = ['#16A0B0','#1651B0']
fig, axs = plt.subplots(len(col), figsize = (10,20),dpi = 1000)
c = 0
for j,i in enumerate(col):
sns.lineplot(df[i], ax = axs[x], color = clr[x%2])
axs[x].set_title(indicators[j], color = '#b5007d')
x += 1
for ax in axs.flat:
ax.set_xlabel(None)
plt.subplots_adjust(hspace=1.12)
RSI (Relative Strength Index) - measures the momentum of a stock's price and helps to determine overbought or oversold conditions.
Parameters required to determine:
close
ADL (Accumulation/Distribution Line) - measures the flow of money into or out of a stock.
Parameters required to determine: high,low,close,volume
ATR (Average True Range) - shows the volatility of a stock and helps to identify potential price breakouts or breakdowns.
Parameters required to determine: high, low, close
MOM (Momentum) - measures the rate of change in a stock's price over a specified period of time.
Parameters required to determine: close
MFI (Money Flow Index) - combines volume and price data to measure buying and selling pressure.
Parameters required to determine: high, low, close, volume
ROC (Rate of Change) - calculates the percentage change in a stock's price over a specified period of time.
Parameters required to determine: close
OBV (On-Balance Volume) - measures buying and selling pressure by adding volume on up days and subtracting volume on down days.
Parameters required to determine: Close, volume
CCI (Commodity Channel Index) - measures a stock's deviation from its average price over a specified period of time, helping to identify potential trend reversals.
Parameters required to determine: high, low, close
display(df.columns)
display(df.shape)
Index(['Open', 'High', 'Low', 'Close', 'Adj Close', 'Volume', 'rolling 7',
'rolling 30', 'rolling 90', 'daily return', 'rsi', 'adl', 'atr', 'mom',
'mfi', 'roc', 'obv', 'cci'],
dtype='object')
(2615, 18)
data = df[['Adj Close', 'daily return','rsi', 'adl', 'atr', 'mom', 'mfi', 'roc', 'obv', 'cci']]
#data = df[['Adj Close','adl','atr', 'mfi','mom', 'obv', 'cci']]
data = data[14:]
corr_matrix = data.corr()
mask = np.logical_or(corr_matrix > 0.7, corr_matrix < -0.7)
plt.figure(figsize=(7,4))
sns.heatmap(corr_matrix, annot=True, fmt='.2f',linecolor='black',linewidths=0.5 ,mask=np.logical_not(mask))
plt.yticks(rotation = 360)
plt.xticks(rotation = 90)
plt.title("Veriable Correlation with mask")
print(data.shape)
#rsi,mom,roc,cci
(2601, 10)
#Dropping highly correlated parameters
data = data.drop(['rsi','mom','mfi','roc','cci'], axis =1)
sns.heatmap(data.corr(), annot=True, fmt='.2f',linecolor='black',linewidths=0.5)
data.shape
(2601, 5)
print(colored(f" The parameter Adjusted Close i.e. Adj Close will be the target veriable for model building and prediction{' '*14}\n\
\tas it is the closing price after adjustments for all applicable splits and dividend distributions.{' '*2}",
'light_cyan', attrs=['bold','reverse','blink']))
print(colored(f"\n {' '*4}The data for futher processing is of shape {data.shape}{' '*5}",
'green', attrs=['bold','reverse','blink']))
The parameter Adjusted Close i.e. Adj Close will be the target veriable for model building and prediction as it is the closing price after adjustments for all applicable splits and dividend distributions. The data for futher processing is of shape (2601, 5)
#Splitting the data into Training, Validation and Test
# Chronological split: first 80% train, next 12% validation, remaining ~8%
# test. No shuffling — order must be preserved for time-series modelling.
print(colored(f"\n\tThe Train-Validation-Test Split{' '*9}",
              'magenta', attrs=['bold','reverse','blink']))
df_close = data.copy()
#df_close = df['daily return']
training_len = int(round(len(df_close)*0.80,0))
validation_len = int(round(len(df_close)*0.12,0))
# train_ deliberately holds BOTH the train and validation rows; the two are
# separated later by slicing with training_len. test_ is the tail.
train_ = df_close[:training_len+validation_len]
test_ = df_close[training_len+validation_len:]
sns.set(rc={'figure.figsize':(8,6)})
# Plot the three splits of the target column (column 0 = 'Adj Close').
sns.lineplot(train_.iloc[:training_len,0], label = 'Train Split')
sns.lineplot(train_.iloc[training_len:training_len+validation_len,0], label = 'Validation Split')
sns.lineplot(test_.iloc[:,0], label = 'Test Split')
print(f"\n{'-'*20}{'*'*8}{'-'*20}")
print(f'\033[1;34mLength of train data is {len(train_[:training_len])}\033[0m,'
      f'\n\033[1;33mLength of Validation data is {len(train_[training_len:training_len+validation_len])}\033[0m,'
      f'\n\033[1;32mLength of Test data is {len(test_)}\033[0m')
print(f"{'-'*20}{'^'*8}{'-'*20}")
The Train-Validation-Test Split --------------------********-------------------- Length of train data is 2081, Length of Validation data is 312, Length of Test data is 208 --------------------^^^^^^^^--------------------
#Scaling the data in a window fashion
#Scaling the data allows the LSTM model to optimize its weights and biases based
#on the distribution of data available within the current window.
#Minmax scaler shall be used since it scales the data between 0 and 1
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler(feature_range=(0,1))
#The scaling window algorithm
def scaling_data(data_input, w=1):
    """Min-max scale `data_input` independently in consecutive windows of width `w`.

    Each window of `w` consecutive values is rescaled to [0, 1] on its own,
    so the scaling adapts to the local distribution of the series.

    Parameters
    ----------
    data_input : 1-D array-like (e.g. a pandas Series) of numeric values.
    w : int, window width (default 1).

    Returns a flat 1-D numpy array of the window-scaled values.

    NOTE(review): uses the module-level `scaler` object — confirm that
    sharing it with the rest of the script is intentional.
    """
    values = np.array(data_input).reshape(-1, 1)
    # Stop at len(values): the original ranged to len(values)+1, which on a
    # length divisible by `w` produced an empty final slice and made
    # MinMaxScaler.fit raise ValueError.
    for start in range(0, len(values), w):
        # fit_transform == fit followed by transform on the same window.
        values[start:start + w] = scaler.fit_transform(values[start:start + w])
    return values.reshape(-1)
scaler = MinMaxScaler(feature_range=(0,1))
scaler_y = MinMaxScaler(feature_range=(0,1))
#Scaling for Multivariate LSTM modeling
# Fit on the TRAINING rows only, then apply that same transform to the
# validation and test rows — avoids look-ahead leakage from future prices.
train_scaled = scaler.fit_transform(train_[:training_len])
val_scaled = scaler.transform(train_[training_len:training_len+validation_len])
test_scaled = scaler.transform(test_)
# Dedicated scaler for the univariate target so predictions can later be
# inverse-transformed back to price units.
# NOTE: the original line fused these two statements onto one line, which is
# a SyntaxError in a plain Python script; they are split here.
scaler_ys = MinMaxScaler(feature_range=(0, 1))
# MinMaxScaler.fit returns self, so scaler_y is the fitted scaler object.
scaler_y = scaler_ys.fit(np.array(train_.iloc[:training_len, 0]).reshape(-1, 1))
#Scaling for Univariate LSTM modeling
train_scaled_uni = scaler_y.fit_transform(np.array(train_.iloc[:training_len,0]).reshape(-1,1)).reshape(-1)
val_scaled_uni = scaler_y.transform(np.array(train_.iloc[training_len:training_len+validation_len,0]).reshape(-1,1)).reshape(-1)
test_scaled_uni = scaler_y.transform(np.array(test_.iloc[:,0]).reshape(-1,1)).reshape(-1)
train_scaled_uni.shape
(2081,)
train_scaled.shape, df_close.shape, data.shape
((2081, 5), (2601, 5), (2601, 5))
#train_data = train_scaled[:training_len]
#validation_data = train_scaled[training_len:training_len+validation_len]
#test_scaled
date_train = df_close.index[:training_len]
date_validation = df_close.index[training_len:training_len+validation_len]
date_test = df_close.index[training_len+validation_len:]
print(f"\nTraining data is dated from {date_train[0].date()} to {date_train[-1].date()} i.e. {len(date_train)} days\n{'-'*75}\
\nValidation will begin from {date_validation[0].date()} to {date_validation[-1].date()} i.e {len(date_validation)} days\n{'-'*75}\
\nlastly, Test data is from {date_test[0].date()} to {date_test[-1].date()} i.e {len(date_test)} days\n{'-'*32}{'^'*11}{'-'*32}")
Training data is dated from 2013-01-21 to 2021-07-01 i.e. 2081 days --------------------------------------------------------------------------- Validation will begin from 2021-07-02 to 2022-10-03 i.e 312 days --------------------------------------------------------------------------- lastly, Test data is from 2022-10-04 to 2023-08-04 i.e 208 days --------------------------------^^^^^^^^^^^--------------------------------
#LSTM uses previous days' data to predict future data points, hence we generate columns of n previous dates
#Data Generate Algorithm for Multivariate-LSTM
def generate_data(data, date_df, w):
    """Build supervised-learning windows for the multivariate LSTM.

    Parameters
    ----------
    data : 2-D array-like of shape (time_steps, n_features), already scaled.
    date_df : sequence of dates aligned row-for-row with `data`.
    w : int, window width — number of previous time steps per sample.

    Returns
    -------
    X : np.ndarray (n_samples, w, n_features) — sliding input windows.
    Y : np.ndarray (n_samples,) — next-step value of feature 0 ('Adj Close').
    D : np.ndarray — the date corresponding to each target in Y.
    """
    X, Y, D = [], [], []
    # range(len(data) - w) visits every valid window; the original
    # range(len(data) - w - 1) silently dropped the final sample.
    for i in range(len(data) - w):
        X.append(data[i:i + w])
        Y.append(data[i + w, 0])  # target: column 0 one step after the window
        D.append(date_df[i + w])
    return np.array(X), np.array(Y), np.array(D)
#Data Generate Algorithm for Univariate-LSTM
def generate_data_uni(data, date_df, w):
    """Build supervised-learning windows for the univariate LSTM.

    Parameters
    ----------
    data : 1-D array-like of scaled target values.
    date_df : sequence of dates aligned element-for-element with `data`.
    w : int, window width — number of previous time steps per sample.

    Returns
    -------
    X : np.ndarray (n_samples, w) — sliding input windows.
    Y : np.ndarray (n_samples,) — the value one step after each window.
    D : np.ndarray — the date corresponding to each target in Y.
    """
    X, Y, D = [], [], []
    # Fixed off-by-one: range(len(data) - w - 1) dropped the last window.
    for i in range(len(data) - w):
        X.append(data[i:i + w])
        Y.append(data[i + w])
        D.append(date_df[i + w])
    return np.array(X), np.array(Y), np.array(D)
#Stating the number of previous days' data to be used in the duration_ variable
#for Multivariate-LSTM
duration_ = 10
X_train, y_train, d_train = generate_data(train_scaled,date_train,duration_)
X_val, y_val, d_val = generate_data(val_scaled,date_validation,duration_)
X_test, y_test, d_test = generate_data(test_scaled,date_test,duration_)
print("For Multivariate LSTM modelling")
print(f"\nThe splitting is now shaped as follows after running it throught the generate_data function\n\
\tThe data is currently structured as a 3D tensor with a dimension of {X_train.shape[1]} timestamps and {X_train.shape[2]} features")
print('-'*52)
print("train shape:",X_train.shape, y_train.shape, d_train.shape)
print('validation:', X_val.shape, y_val.shape, d_val.shape)
print('test shape:', X_test.shape, y_test.shape, d_test.shape)
For Multivariate LSTM modelling The splitting is now shaped as follows after running it throught the generate_data function The data is currently structured as a 3D tensor with a dimension of 10 timestamps and 5 features ---------------------------------------------------- train shape: (2070, 10, 5) (2070,) (2070,) validation: (301, 10, 5) (301,) (301,) test shape: (197, 10, 5) (197,) (197,)
#for Univariate-LSTM
duration_ = 10
X_train_u, y_train_u, d_train = generate_data_uni(train_scaled_uni,date_train,duration_)
X_val_u, y_val_u, d_val = generate_data_uni(val_scaled_uni,date_validation,duration_)
X_test_u, y_test_u, d_test = generate_data_uni(test_scaled_uni,date_test,duration_)
print("For Univariate LSTM modelling")
print(f"\nThe splitting is now shaped as follows after running it throught the generate_data_univerate function\n\
\t{duration_} columns generated, having adj close price prior to {duration_} days will be used as X_train while next-day will be y_train\
\n\nThe corresponding are the X,Y and Date array shapes")
print('-'*52)
print("train shape:",X_train_u.shape, y_train_u.shape, d_train.shape)
print('validation:', X_val_u.shape, y_val_u.shape, d_val.shape)
print('test shape:', X_test_u.shape, y_test_u.shape, d_test.shape)
For Univariate LSTM modelling The splitting is now shaped as follows after running it throught the generate_data_univerate function 10 columns generated, having adj close price prior to 10 days will be used as X_train while next-day will be y_train The corresponding are the X,Y and Date array shapes ---------------------------------------------------- train shape: (2070, 10) (2070,) (2070,) validation: (301, 10) (301,) (301,) test shape: (197, 10) (197,) (197,)
#Reshaping the data into 3D for Univariate LSTM modelling
X_train_u = X_train_u.reshape(X_train_u.shape[0],X_train_u.shape[1],1)
X_val_u = X_val_u.reshape(X_val_u.shape[0],X_val_u.shape[1],1)
X_test_u = X_test_u.reshape(X_test_u.shape[0], X_test_u.shape[1],1)
print(colored(f"\n Converting data into a 3d array for Univariate LSTM \n",'light_red', attrs=['bold','reverse','blink']))
#print("Shape of train, validation and test are now as follows:\n",X_train.shape, X_val.shape, X_test.shape)
print("train data shape for univerate:","X:",X_train_u.shape,"|","y:",y_train_u.shape)
print('validation shape for univerate:',"X: ",X_val_u.shape,"|","y:",y_val_u.shape)
print('test data shape for univerate:'," X: ",X_test_u.shape,"|","y:",y_test_u.shape)
Converting data into a 3d array for Univariate LSTM
train data shape for univerate: X: (2070, 10, 1) | y: (2070,)
validation shape for univerate: X: (301, 10, 1) | y: (301,)
test data shape for univerate: X: (197, 10, 1) | y: (197,)
#Importing the required libraries for LSTM modelling
from sklearn.model_selection import GridSearchCV
from keras.models import Sequential
from keras.layers import LSTM, Dense, Dropout, BatchNormalization
from keras.wrappers.scikit_learn import KerasClassifier
from tensorflow.keras.callbacks import EarlyStopping
#For Univariate LSTM model
# Two stacked LSTM layers feeding a small dense head. Input: a window of
# X_train_u.shape[1] (=10) past scaled Adj Close values; output: the next one.
model_u = Sequential(name="univariate_LSTM")
#Input LSTM layer: tanh cell activation, sigmoid recurrent (gate) activation;
#return_sequences=True so the next LSTM layer receives the full sequence.
model_u.add(LSTM(units=128, activation='tanh', recurrent_activation='sigmoid',
                 return_sequences=True, input_shape=(X_train_u.shape[1], 1)))
#Second LSTM layer, same activations; return_sequences=False collapses the
#sequence to the last hidden state for the dense head.
model_u.add(LSTM(units=64, activation='tanh',
                 recurrent_activation='sigmoid', return_sequences=False))
#Dropout for regularisation (drops 20% of units; dropout has no activation).
model_u.add(Dropout(0.2))
#Dense hidden layer with ReLU activation
model_u.add(Dense(units=25, activation='relu'))
#Output layer: single unit, linear activation (regression output)
model_u.add(Dense(units=1, activation='linear'))
model_u.compile(loss='mse', optimizer='adam')
model_u.summary()
Model: "univariate_LSTM"
_________________________________________________________________
Layer (type) Output Shape Param #
=================================================================
lstm (LSTM) (None, 10, 128) 66560
lstm_1 (LSTM) (None, 64) 49408
dropout (Dropout) (None, 64) 0
dense (Dense) (None, 25) 1625
dense_1 (Dense) (None, 1) 26
=================================================================
Total params: 117,619
Trainable params: 117,619
Non-trainable params: 0
_________________________________________________________________
# Model for Univariate LSTM modelling #Best model #window was set to 7, entire data from 2010 was used and epoch =3 #rmse = 10.060 model_u = Sequential(name="univariate_LSTM") model_u.add(LSTM(128, return_sequences=True, input_shape= (X_train_u.shape[1], 1))) #model_u.add(Dropout(0.2)) model_u.add(LSTM(64, return_sequences=False)) model_u.add(Dropout(0.2)) model_u.add(Dense(25)) model_u.add(Dense(1)) model_u.compile(loss = 'mse', optimizer = 'adam') model_u.summary()
# Model for Multivariate LSTM
# Same stacked-LSTM idea as the univariate model, but the input window has
# X_train.shape[2] (=5) features per time step and the dense head is deeper.
model = Sequential(name="Multivariate-LSTM")
#Input LSTM layer with Keras defaults (tanh cell, sigmoid gates);
#return_sequences=True feeds the full sequence to the next LSTM layer.
model.add(LSTM(units=128, return_sequences=True, input_shape=(X_train.shape[1], X_train.shape[2])))
#Dropout for regularisation — 30% (dropout layers have no activation).
model.add(Dropout(0.3))
#Second LSTM layer with default activations; collapses to last hidden state.
model.add(LSTM(units=64, return_sequences=False))
#Dropout for regularisation — 20%.
model.add(Dropout(0.2))
#Dense hidden layer with linear activation
model.add(Dense(units=25, activation='linear'))
#Batch normalisation to stabilise the dense-head activations
model.add(BatchNormalization())
#Second dense hidden layer, linear activation
model.add(Dense(units=12, activation='linear'))
#Output layer: single unit, linear activation (regression output)
model.add(Dense(units=1, activation='linear'))
model.compile(loss='mse', optimizer='adam')
model.summary()
Model: "Multivariate-LSTM"
_________________________________________________________________
Layer (type) Output Shape Param #
=================================================================
lstm_2 (LSTM) (None, 10, 128) 68608
dropout_1 (Dropout) (None, 10, 128) 0
lstm_3 (LSTM) (None, 64) 49408
dropout_2 (Dropout) (None, 64) 0
dense_2 (Dense) (None, 25) 1625
batch_normalization (BatchN (None, 25) 100
ormalization)
dense_3 (Dense) (None, 12) 312
dense_4 (Dense) (None, 1) 13
=================================================================
Total params: 120,066
Trainable params: 120,016
Non-trainable params: 50
_________________________________________________________________
#Defining the LSTM layers # Model for Multiverate LSTM modelling model = Sequential(name="Multivariate-LSTM") model.add(LSTM(128, return_sequences=True, input_shape= (X_train.shape[1], X_train.shape[2]))) model.add(Dropout(0.3)) model.add(LSTM(64, return_sequences=False)) model.add(Dropout(0.2)) #model.add(BatchNormalization()) model.add(Dense(25)) model.add(BatchNormalization()) model.add(Dense(12)) model.add(Dense(1)) #model.add(LSTM(24, return_sequences=False)) #model.add(BatchNormalization()) #model.add(Dropout(0.2)) #model.add(Dense(12)) #model.add(Dense(1)) model.compile(loss = 'mse', optimizer = 'adam') model.summary() # Define early stopping callback early_stop = EarlyStopping(monitor='val_loss', patience=10, verbose=1, mode='auto')
# Reset the univariate model and its accumulated loss history so the
# training cell can be re-run from scratch without stale state.
try:
    del model_u
    print('deleted model')
except NameError:
    # Narrowed from a bare `except:` — only "name not defined" means
    # "already deleted"; any other error should surface, not be swallowed.
    print('model already deleted')
try:
    del loss_history_u
    print('deleted history')
except NameError:
    print('history already deleted')
#Algorithm I used to append history to check for best epochs
def com_his(history_, m=""):
    """Append a Keras fit() History to the running loss log.

    Parameters
    ----------
    history_ : object with a ``.history`` dict (e.g. keras.callbacks.History).
    m : "univ" extends the univariate log ``loss_history_u``; any other
        value extends the multivariate log ``loss_history``.

    Returns a DataFrame of all accumulated epochs, re-indexed 1..n.
    """
    df_new = pd.DataFrame(history_.history)
    # Only a missing global log should fall back to a fresh frame; the
    # original bare `except:` also hid genuine pd.concat errors.
    try:
        prev = loss_history_u if m == "univ" else loss_history
        dff = pd.concat([prev, df_new], axis=0, ignore_index=True)
    except NameError:
        dff = df_new.copy()
    # 1-based index so the index reads directly as the epoch number.
    dff.set_index([pd.Index(list(range(1, len(dff) + 1)))], inplace=True)
    return dff
#The algorithm is designed to evaluate the performance of the model and
# gain insights on its performance when applied to testing and training data.
from matplotlib.gridspec import GridSpec
def model_efficiency(X_test,X_train,y_train,y_test,loss_history,model):
    """Report train/test RMSE in price units and plot loss + fit curves.

    Relies on module-level globals: ``scaler_y`` (to inverse-transform
    scaled targets back to prices) and ``d_train`` / ``d_test`` (date
    arrays used for the x-axes of the fit plots).

    Returns (rmse_train, rmse_test).
    """
    predict_test = model.predict(X_test)
    predict_train = model.predict(X_train)
    #rmse_test_s = mean_squared_error(y_test, predict_test, squared=False)
    #rmse_train_s = mean_squared_error(y_train, predict_train, squared=False)
    # RMSE is computed AFTER inverse-scaling, so values are in rupees,
    # not in the scaled [0, 1] range (squared=False -> RMSE, not MSE).
    rmse_test = mean_squared_error(scaler_y.inverse_transform(y_test.reshape(-1,1)).reshape(-1),
                                   scaler_y.inverse_transform(predict_test).reshape(-1), squared=False)
    rmse_train = mean_squared_error(scaler_y.inverse_transform(y_train.reshape(-1,1)).reshape(-1),
                                    scaler_y.inverse_transform(predict_train).reshape(-1), squared=False)
    #print(f'Scaled_Train RMSE = {rmse_train_s:0.3f}\nScaled_Test RMSE = {rmse_test_s:0.3f}')
    print(f'\nConsidering {X_train.shape[1]} previous date data for model training the performance is as follows\nAfter {len(loss_history)} epochs,')
    print(f'Train RMSE = {rmse_train:0.3f}\nTest RMSE = {rmse_test:0.3f}')
    # Layout: loss curve spans the top row; test fit bottom-left,
    # train fit bottom-right.
    fig = plt.figure(figsize=(6,6))
    gs = fig.add_gridspec(nrows=2, ncols=2)
    ax1 = fig.add_subplot(gs[0, 0:2])
    ax2 = fig.add_subplot(gs[1, 0])
    ax3 = fig.add_subplot(gs[1, 1])
    sns.lineplot(loss_history, ax=ax1)
    ax1.set_ylabel("Loss")
    ax1.set_xlabel("Epoch")
    ax1.set_title('Model Loss')
    # True vs predicted prices on the test split
    sns.lineplot(x =d_test,y = scaler_y.inverse_transform(y_test.reshape(-1,1)).reshape(-1), label = 'True', ax=ax2)
    sns.lineplot(x =d_test,y = scaler_y.inverse_transform(predict_test).reshape(-1), label = 'Predict', ax=ax2)
    ax2.set_xticklabels(ax2.get_xticklabels(), rotation=90)
    ax2.set_title('Test fit')
    # True vs predicted prices on the train split
    sns.lineplot(x =d_train,y = scaler_y.inverse_transform(y_train.reshape(-1,1)).reshape(-1), label = 'True', ax=ax3)
    sns.lineplot(x =d_train,y = scaler_y.inverse_transform(predict_train).reshape(-1), label = 'Predict', ax=ax3)
    ax3.set_title('Train fit')
    ax3.set_xticklabels(ax3.get_xticklabels(), rotation=90)
    plt.tight_layout()
    return rmse_train, rmse_test
#Fitting the data to the model: for Univariate-LSTM
# Batch-size / epoch combinations tried while tuning:
#64 batch: 25 epoch+5
#100 batch: 15 epoch-5
#200 batch: 03 epoch+4
# shuffle=False keeps the windows in chronological order during training.
history_u = model_u.fit(X_train_u,y_train_u, validation_data=(X_val_u,y_val_u),
                        epochs=7, batch_size=200, shuffle = False)
# Accumulate this run's losses into the running univariate history log.
loss_history_u = com_his(history_u,"univ")
Epoch 1/7 11/11 [==============================] - 0s 26ms/step - loss: 0.0018 - val_loss: 0.0023 Epoch 2/7 11/11 [==============================] - 0s 24ms/step - loss: 0.0017 - val_loss: 0.0028 Epoch 3/7 11/11 [==============================] - 0s 25ms/step - loss: 0.0019 - val_loss: 0.0024 Epoch 4/7 11/11 [==============================] - 0s 26ms/step - loss: 0.0016 - val_loss: 0.0030 Epoch 5/7 11/11 [==============================] - 0s 26ms/step - loss: 0.0020 - val_loss: 0.0024 Epoch 6/7 11/11 [==============================] - 0s 25ms/step - loss: 0.0016 - val_loss: 0.0028 Epoch 7/7 11/11 [==============================] - 0s 28ms/step - loss: 0.0018 - val_loss: 0.0023
#Univariate model Performance (the model_efficiency function is defined elsewhere in the file)
print(colored(f"\n{' '*30}Univariate LSTM Model{' '*30}", 'yellow', attrs=['bold','reverse']))
#Prints train/test RMSE and plots loss/test-fit/train-fit for the univariate model
rmse_train,rmse_test = model_efficiency(X_test_u,X_train_u,y_train_u,y_test_u,loss_history_u, model_u)
Univariate LSTM Model
7/7 [==============================] - 0s 4ms/step
65/65 [==============================] - 0s 3ms/step
Considering 10 previous date data for model training the performance is as follows
After 47 epochs,
Train RMSE = 20.184
Test RMSE = 20.226
#Fitting the data to the model: for Multivariate-LSTM
#shuffle=False preserves the chronological order of the training sequences
history = model.fit(X_train,y_train, validation_data=(X_val,y_val), epochs=10, batch_size=768, shuffle = False)
#com_his presumably combines this run's loss history with earlier runs — defined elsewhere; TODO confirm
loss_history = com_his(history)
Epoch 1/10 3/3 [==============================] - 0s 106ms/step - loss: 0.0310 - val_loss: 0.0025 Epoch 2/10 3/3 [==============================] - 0s 99ms/step - loss: 0.0309 - val_loss: 0.0025 Epoch 3/10 3/3 [==============================] - 0s 96ms/step - loss: 0.0308 - val_loss: 0.0017 Epoch 4/10 3/3 [==============================] - 0s 102ms/step - loss: 0.0309 - val_loss: 0.0047 Epoch 5/10 3/3 [==============================] - 0s 95ms/step - loss: 0.0308 - val_loss: 0.0018 Epoch 6/10 3/3 [==============================] - 0s 101ms/step - loss: 0.0309 - val_loss: 0.0016 Epoch 7/10 3/3 [==============================] - 0s 100ms/step - loss: 0.0309 - val_loss: 0.0023 Epoch 8/10 3/3 [==============================] - 0s 97ms/step - loss: 0.0309 - val_loss: 0.0013 Epoch 9/10 3/3 [==============================] - 0s 97ms/step - loss: 0.0310 - val_loss: 0.0014 Epoch 10/10 3/3 [==============================] - 0s 94ms/step - loss: 0.0310 - val_loss: 0.0011
Training the Multivariate model with the configuration above
gave the best fit on test data, with RMSE as under
#Multivariate model Performance
print(colored(f"\n{' '*30}Multivariate LSTM Model{' '*30}", 'cyan', attrs=['bold','reverse']))
#Prints train/test RMSE and plots loss/test-fit/train-fit for the multivariate model
rmse_train_Multivariate, rmse_test_Multivariate = model_efficiency(X_test,X_train,y_train,y_test,loss_history,model)
Multivariate LSTM Model
7/7 [==============================] - 0s 4ms/step
65/65 [==============================] - 0s 4ms/step
Considering 10 previous date data for model training the performance is as follows
After 260 epochs,
Train RMSE = 11.643
Test RMSE = 9.527
It is not unusual for the validation loss and training loss to differ in LSTM modeling.
The most important thing to look for is how well the model is performing on the test data.
If the test data fit and test RMSE are good, that indicates that the model is able to accurately predict new data.
#Predicting values from test data and train data to establish model performance predict_test = model.predict(X_test) predict_train = model.predict(X_train)
from sklearn.metrics import mean_squared_error fig, axs = plt.subplots(2) sns.lineplot(x =d_test,y = scaler_y.inverse_transform(y_test.reshape(-1,1)).reshape(-1), label = 'True', ax=axs[0]) sns.lineplot(x =d_test,y = scaler_y.inverse_transform(predict_test).reshape(-1), label = 'Predict', ax=axs[0]) axs[0].set_title('Multiverate Model Test fit') sns.lineplot(x =d_train,y = scaler_y.inverse_transform(y_train.reshape(-1,1)).reshape(-1), label = 'True', ax=axs[1]) sns.lineplot(x =d_train,y = scaler_y.inverse_transform(predict_train).reshape(-1), label = 'Predict', ax=axs[1]) axs[1].set_title('Multiverate Model Train fit') plt.tight_layout()
Scaled_Train RMSE = 0.020 Scaled_Test RMSE = 0.023 Train RMSE = 10.783 Test RMSE = 12.069
#Saving the LSTM models to disk
#NOTE(review): pickling Keras models is fragile across TF versions; model.save(...)
#is the recommended route — confirm before relying on these files long-term.
#Fixed: the `with`-block bodies had lost their indentation (invalid Python)
import pickle

with open('Multivariate_model', 'wb') as file:
    pickle.dump(model, file)
with open('Univariate_model', 'wb') as file:
    pickle.dump(model_u, file)
# loading the saved model
#Fixed: the model was saved as 'Multivariate_model' (see the save step), but this
#cell opened 'Multivariate_LSTM_model', which raises FileNotFoundError
with open('Multivariate_model', 'rb') as file:
    Multivariate_model = pickle.load(file)
This algorithm forecasts the stock's performance for the succeeding "duration_" days.
Upon executing the algorithm,
the user will be prompted to enter the desired "Date" and the number of days for predictions.
Subsequently, a graph representing the generated predictions will be displayed.
Additionally, there is an option to save the predictions as a CSV file for further reference.
#Interactive forecast with the univariate model (predict_future_ is defined later in the file;
#it prompts for a reference date and a horizon, then plots the predictions)
print(colored(f"\n{' '*30}Forecasting with Univariate LSTM Model{' '*30}\n", 'yellow', attrs=['bold','reverse']))
predict_future_()
Forecasting with Univariate LSTM Model
Enter past reference date for predictions or leave blank:
Enter No. of days to predict for; default is 3 days = 2
The Model predicts based on the data of 10 days after reference date,
kindly,
wait for predictions
The model was build on data from 2013-01-01 to 2023-08-04
Train RMSE = 20.184
Test RMSE = 20.226
The predicted values are for 2023-08-05 to 2023-08-06 i.e 2 days
based on the prices from 2023-07-24 to 2023-08-04
#One-step-ahead forecast with the multivariate model, plotted against the last week of prices
print(colored(f"\n{' '*30}Forecasting with Multivariate LSTM Model{' '*30}", 'cyan', attrs=['bold','reverse']))
x1,x2,x3 = predict_condition()
#pred returns (rolled window, predicted value); keep only the single-step prediction
future_value = [pred(x1)[1][0]]
#Prepend the last observed price so the prediction trace connects to the history.
#Fixed: .iloc[-1]/.index[-1] replace the deprecated positional `tail(1)[0]` lookup
future_value.insert(0, df_close['Adj Close'].iloc[-1])
last_date = df_close['Adj Close'].index[-1]
future_date = [last_date, last_date + datetime.timedelta(days=1)]
print(f"future value of stock on {(last_date + datetime.timedelta(days=1)).date()} is predicted to be {future_value[1]:0.3f}")
fig = go.Figure()
fig.add_trace(go.Scatter(x = df_close['Adj Close'].tail(7).index, y = df_close['Adj Close'].tail(7).values, showlegend=False, line=dict(color='dimgray')))
fig.add_trace(go.Scatter(x=future_date, y=future_value, name="Predicted", line=dict(color='hotpink')))
fig.update_layout(title=f'Multiverate LSTM Model prediction', title_font_size=20, title_x=0.5, template="plotly_dark")
fig.show(renderer='notebook')
Forecasting with Multivariate LSTM Model
The Model predicts based on the data of 10
kindly,
wait for predictions
future value of stock on 2023-08-05 is predicted to be 621.562
#dfx = df_close.copy()
#Collapse the single-column DataFrame to a Series so date-slicing below yields 1-D data
df_close = df_close.iloc[:,0]
#df_close = dfx.copy()
#Checks if the dataframe has dates succeding to the give date def predict_condition(date_=""): s = X_train.shape[1] if date_ != "": after_date = pd.Timestamp(date_) pred_data = np.array(df_close[after_date:].head(s)) date = df_close[after_date:].head(s)[-1:].index[0] dft = pd.DataFrame(df_close[after_date:].head(s)) else: pred_data = np.array(df_close[-s:]) date = df_close[-1:].index[0] dft = pd.DataFrame(df_close[-s:]) #Data Check print(f'\nThe Model predicts based on the data of {s} \nkindly,') print(f"choose a day before {after_date-datetime.timedelta(days=(s-len(pred_data)))} as analysis data is short by {s-len(pred_data)} days"\ if len(pred_data) < s else "wait for predictions\n\n") return pred_data, date, dft
#For Multivariate LSTM Model
#Checks if the dataframe has dates succeeding the given date
def predict_condition(date_=""):
    """Build the model-input window of prices starting after `date_`.

    Parameters
    ----------
    date_ : str, optional
        Reference date; "" (default) uses the most recent data.

    Returns
    -------
    (pred_data, date, dft) : the window as an ndarray, the date of its last
        row, and the same window as a DataFrame.
    """
    s = X_train.shape[1]  # window length the LSTM was trained on
    if date_ != "":
        after_date = pd.Timestamp(date_)
        pred_data = np.array(df_close[after_date:].head(s))
        date = df_close[after_date:].head(s)[-1:].index[0]
        dft = pd.DataFrame(df_close[after_date:].head(s))
    else:
        after_date = None  # Fixed: previously unbound in this branch
        pred_data = np.array(df_close[-s:])
        date = df_close[-1:].index[0]
        dft = pd.DataFrame(df_close[-s:])
    #Data Check
    print(f'\nThe Model predicts based on the data of {s} \nkindly,')
    if len(pred_data) < s:
        #Fixed: the original evaluated `after_date` here even when date_ was "",
        #raising UnboundLocalError when the data fell short with no reference date
        if after_date is not None:
            print(f"choose a day before {after_date-datetime.timedelta(days=(s-len(pred_data)))} as analysis data is short by {s-len(pred_data)} days")
        else:
            print(f"analysis data is short by {s-len(pred_data)} days")
    else:
        print("wait for predictions\n\n")
    return pred_data, date, dft
def pred(data):
    """Run one forward step of the multivariate LSTM.

    Scales `data`, predicts the next price, and returns the window rolled
    forward by one step together with the predicted value.
    """
    window_len = X_train.shape[1]
    n_features = X_train.shape[2]
    # Scale the raw window and shape it as one (1, window, features) batch for the LSTM
    scaled = scaler.transform(data).reshape(1, window_len, n_features)
    raw_pred = model.predict(scaled, verbose=0).reshape(-1, 1)
    # Map the scaled prediction back to price units
    next_val = scaler_y.inverse_transform(raw_pred).reshape(-1)
    # Roll the window: drop the oldest entry, append the prediction.
    # NOTE(review): np.append without axis= flattens a 2-D window to 1-D — confirm
    # downstream multi-step use handles this
    rolled = np.append(data[1:], next_val)
    return rolled, next_val
#Predicts the price for n days based on user input
def predict_for(data, date, d=7):
    """Iteratively forecast `d` future prices starting after `date`.

    Parameters
    ----------
    data : ndarray   -- most recent window of prices (model input)
    date : Timestamp -- date of the last known observation
    d : int          -- number of future days to predict (default 7)

    Returns
    -------
    (y, yd) : list of predicted prices and their calendar dates.
    """
    pred_data = data.copy()
    y = []
    yd = []
    for step in range(d):
        #Fixed: the original called pred() twice per iteration, running the
        #model forward pass twice; one call yields both window and value
        pred_data, val = pred(pred_data)
        y.append(val[0])
        yd.append(date + datetime.timedelta(days=step + 1))
    return y, yd
#For Univariate model
def predict_condition(date_=""):
    """Build the model-input window of prices starting after `date_`.

    Parameters
    ----------
    date_ : str, optional
        Reference date; "" (default) uses the most recent data.

    Returns
    -------
    (pred_data, date, dft) : the window as an ndarray, the date of its last
        row, and the same window as a DataFrame.
    """
    s = X_train_u.shape[1]  # window length the univariate LSTM was trained on
    if date_ != "":
        after_date = pd.Timestamp(date_)
        pred_data = np.array(df_close[after_date:].head(s))
        date = df_close[after_date:].head(s)[-1:].index[0]
        dft = pd.DataFrame(df_close[after_date:].head(s))
    else:
        after_date = None  # Fixed: previously unbound in this branch
        pred_data = np.array(df_close[-s:])
        date = df_close[-1:].index[0]
        dft = pd.DataFrame(df_close[-s:])
    #Data Check
    print(f'\nThe Model predicts based on the data of {s} days after reference date, \nkindly,')
    if len(pred_data) < s:
        #Fixed: the original evaluated `after_date` here even when date_ was "",
        #raising UnboundLocalError when the data fell short with no reference date
        if after_date is not None:
            print(f"choose a day before {after_date-datetime.timedelta(days=(s-len(pred_data)))} as analysis data is short by {s-len(pred_data)} days")
        else:
            print(f"analysis data is short by {s-len(pred_data)} days")
    else:
        print("wait for predictions\n\n")
    return pred_data, date, dft
def pred(data):
    """Run one forward step of the univariate LSTM.

    Scales the 1-D price window, predicts the next price, and returns the
    window rolled forward by one step together with the predicted value.
    """
    window_len = X_train_u.shape[1]
    # Column vector of prices, scaled, then shaped (1, window, 1) for the LSTM
    scaled_window = scaler_y.transform(data.reshape(-1, 1)).reshape(1, window_len, 1)
    raw_pred = model_u.predict(scaled_window, verbose=0).reshape(-1, 1)
    # Map the scaled prediction back to price units
    next_val = scaler_y.inverse_transform(raw_pred).reshape(-1)
    # Slide the window: drop the oldest price, append the prediction
    rolled = np.append(data[1:], next_val)
    return rolled, next_val
#Predicts the price for n days based on user input
def predict_for(data, date, d=7):
    """Iteratively forecast `d` future prices starting after `date`.

    Parameters
    ----------
    data : ndarray   -- most recent window of prices (model input)
    date : Timestamp -- date of the last known observation
    d : int          -- number of future days to predict (default 7)

    Returns
    -------
    (y, yd) : list of predicted prices and their calendar dates.
    """
    pred_data = data.copy()
    y = []
    yd = []
    for step in range(d):
        #Fixed: the original called pred() twice per iteration, running the
        #model forward pass twice; one call yields both window and value
        pred_data, val = pred(pred_data)
        y.append(val[0])
        yd.append(date + datetime.timedelta(days=step + 1))
    return y, yd
# Plot using seaborn #Plots graphs based on the predictions(in red) in continuation to previous data points(in blue) def plot_fututre(df_future,d,dfp): st, sp = df_future.index[0].date(), df_future.index[len(df_future)-len(dfp)-1].date() spd = df_future.index[len(df_future)-len(dfp)-1+d].date() sns.set(rc={'figure.figsize':(8,6)}) sns.lineplot(df_close[st:spd], label = '', color = 'white') s = X_train.shape[1] sns.lineplot(data = df_future[:-d], x = df_future.index[:-d], y = 'Adj Close', label = 'True_past') sns.lineplot(data = df_future[-d-1:] , x = df_future.index[-d-1:], y = 'Adj Close', label = 'Predicted_Future', color = 'red') plt.xticks(rotation = 25) plt.title(f'Future Predictions for {len(df_future[-d:])} days\nagainst {len(df_future[:-d])} previous days') print(f'The model was build on data from {df.index[0].date()} to {df.index[-1].date()}') print(f'Train RMSE = {rmse_train:0.3f}\nTest RMSE = {rmse_test:0.3f}\n') print(f'\nThe predicted values are for {dfp.index[0].date()} to {dfp.index[-1].date()} i.e {d} days') print(f'based on the prices from {st} to {sp}\n')
# Plot using Plotly
#Plots graphs based on the predictions(in red) in continuation to previous data points(in blue)
def plot_fututre(df_future, d, dfp):
    """Render past prices plus the `d`-day forecast as a Plotly figure and
    print a run summary (training span, RMSEs, prediction window)."""
    # Position of the last true (non-predicted) observation in df_future
    last_true = len(df_future) - len(dfp) - 1
    st, sp = df_future.index[0].date(), df_future.index[last_true].date()
    spd = df_future.index[last_true + d].date()
    # Extend the grey context line a few days past the forecast window
    x = spd + datetime.timedelta(days=3)
    fig = go.Figure()
    fig.add_trace(go.Scatter(x=df_close[st:x].index, y=df_close[st:x].values, showlegend=False, line=dict(color='dimgray')))
    fig.add_trace(go.Scatter(x=df_future[:-d].index, y=df_future[:-d].values.reshape(-1), name="True_past", line=dict(color='dodgerblue')))
    fig.add_trace(go.Scatter(x=df_future[-d-1:].index, y=df_future[-d-1:].values.reshape(-1), name="Future_Predict", line=dict(color='hotpink')))
    fig.update_layout(title=f'Future Predictions for {len(df_future[-d:])} days against {len(df_future[:-d])} previous days', title_font_size=20, title_x=0.5, template="plotly_dark")
    print(f'The model was build on data from {df.index[0].date()} to {df.index[-1].date()}')
    print(f'Train RMSE = {rmse_train:0.3f}\nTest RMSE = {rmse_test:0.3f}\n')
    print(f'\nThe predicted values are for {dfp.index[0].date()} to {dfp.index[-1].date()} i.e {d} days')
    print(f'based on the prices from {st} to {sp}\n')
    fig.show(renderer='notebook')
#Combining the above for user friendly experiance
def predict_future_():
    """Prompt for a reference date and horizon, forecast, and plot the result.

    Prompts the user (blank date means "use latest data"; blank horizon
    defaults to 3 days), builds the input window, predicts day by day, and
    hands the combined history + predictions to plot_fututre.
    """
    Start_Date = input("Enter past reference date for predictions or leave blank: ")
    predict_days = input('Enter No. of days to predict for; default is 3 days = ')
    #Start_Date, predict_days = "2022-09-27", '30'
    #Fixed idiom: replaced the '[a if c else b][0]' construct with a plain
    #conditional expression (same behavior)
    d = int(predict_days) if predict_days != "" else 3
    pred_data, date, dft = predict_condition(date_=Start_Date)
    fut_val, fut_date = predict_for(pred_data, date, d)
    dfp = pd.DataFrame(zip(fut_date, fut_val), columns = ['Date','Adj Close']).set_index('Date')
    df_future = pd.concat([dft, dfp])
    plot_fututre(df_future, d, dfp)
    #return df_future, d
    # on activating the above code line: will return the predictions in a dataframe
    # which can be then exported to csv file